import torch
import matplotlib.pyplot as plt
from IPython.display import Audio
from utils import plot_spectrogram

#TODO: changeme
%cd /Users/janne/git/tutorial/codes
# from codes.data_loader import GTZANLoader
# notebook cell output: /Users/janne/git/tutorial/codes

Audio Data Augmentations

In this chapter, we will discuss common transformations that we can apply to audio signals. We will refer to these as “audio data augmentations”.

Data augmentations are a set of methods that add modified copies to a dataset, from the existing data. This process creates many variations of natural data, and can act as a regulariser to reduce the problem of overfitting. It can also help deep neural networks become robust to complex variations of natural data, which improves their generalisation performance.

In the field of computer vision, the transformations that we apply to images are often very self-explanatory. Take this image, for example. It becomes fairly obvious that we have applied various amounts of Gaussian blurring to this image.

alt text

Naturally, we cannot translate transformations from the vision domain directly to the audio domain. Before we explore a battery of audio data augmentations, we now list the currently available code libraries:

Code Libraries

Name

Author

Framework

Language

License

Link

Muda

B. McFee et al. (2015)

General Purpose

Python

ISC License

source code

Audio Degradation Toolbox

M. Mauch et al. (2013)

General Purpose

MATLAB

GNU General Public License 2.0

source code

rubberband

-

General Purpose

C++

GNU General Public License (non-commercial)

website, pyrubberband

audiomentations

I. Jordal (2021)

General Purpose

Python

MIT License

source code

tensorflow-io

tensorflow.org

TensorFlow

Python

Apache 2.0 License

tutorial

torchaudio

pytorch.org

PyTorch

Python

BSD 2-Clause “Simplified” License

source code

torch-audiomentations

Asteroid (2021)

PyTorch

Python

MIT License

source code

torchaudio-augmentations

J. Spijkervet (2021)

PyTorch

Python

MIT License

source code

Listening

One of the most essential, and yet overlooked, parts of music research is exploring and observing the data. This also applies to data augmentation research: one has to develop a general understanding of the effect of transformations that can be applied to audio. Even more so, when transformations are applied sequentially.

For instance, we will understand why a reverb applied before a frequency filter will sound different than when the reverb is applied after the frequency filter. Before we develop this intuition, let’s listen to a series of audio data augmentations.

# Fetch the GTZAN genre dataset (downloaded on first run) and pull out a
# single track to inspect and listen to.
from torchaudio.datasets import GTZAN

dataset = GTZAN(root=".", download=True)
print(len(dataset))

# Each item is a (waveform, sample_rate, genre_label) tuple.
example = dataset[5]
audio, sr, genre = example

print(f"Genre: {genre}\nSample rate: {sr}\nChannels: {audio.shape[0]}\nSamples: {audio.shape[1]}")
display(Audio(audio, rate=sr))

Augmentation Modules

from torchaudio_augmentations import Compose, ComposeMany
from torchaudio_augmentations import (
    Delay,
    Gain,
    HighLowPass,
    Noise,
    PitchShift,
    PolarityInversion,
    RandomResizedCrop,
    Reverb,
)
import math

# Synthesise one full cycle of a 440 Hz sine wave at the dataset's sample
# rate; unsqueeze adds a leading dimension so the tensor has the same
# (1, samples) layout the augmentations below expect.
period = 1 / 440.0
t = torch.linspace(0, period, int(period * sr))
test_audio = torch.sin(math.tau * 440.0 * t).unsqueeze(0)
plt.plot(test_audio.squeeze(0))
plt.show()


# Polarity inversion flips the sign of every sample: inaudible on its own,
# but the plotted waveform is visibly mirrored around zero.
invert = PolarityInversion()
inverted_test_audio = invert(test_audio)
plt.plot(inverted_test_audio.squeeze(0))
plt.show()
../_images/data-augmentation_nb_8_0.png ../_images/data-augmentation_nb_8_1.png

Compose Module

# A Compose pipeline containing a single Delay (echo) transform.
echo = Delay(
    sample_rate=sr,
    volume_factor=0.5,
    min_delay=100,
    max_delay=500,
    delay_interval=1,
)
transform = Compose([echo])

print("Transform:", transform)
transformed_audio = transform(audio)

# Play the original followed by the delayed version for A/B comparison.
display(Audio(audio, rate=sr))
display(Audio(transformed_audio, rate=sr))
Transform: Compose(
	Delay()
)

Stack audio augmentations

# 4 seconds of audio
num_samples = sr * 4

# Build each stage with a name, then chain them: crop, band filter, echo.
crop = RandomResizedCrop(n_samples=num_samples)
band_filter = HighLowPass(
    sample_rate=sr,
    lowpass_freq_low=2200,
    lowpass_freq_high=4000,
    highpass_freq_low=200,
    highpass_freq_high=1200,
)
echo = Delay(
    sample_rate=sr,
    volume_factor=0.5,
    min_delay=100,
    max_delay=500,
    delay_interval=1,
)

# NOTE: `transforms` is reused by the ComposeMany example further down.
transforms = [crop, band_filter, echo]
transform = Compose(transforms)

print("Transform:", transform)
transformed_audio = transform(audio)

display(Audio(transformed_audio, rate=sr))
Transform: Compose(
	RandomResizedCrop()
	HighLowPass()
	Delay()
)

Return multiple augmented samples

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4

# ComposeMany applies the same chain several times, yielding one
# independently augmented copy of the clip per requested sample.
transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)

transformed_audio = transform(audio)
for ta in transformed_audio:
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
plt.show()
Transform: ComposeMany(
	RandomResizedCrop()
	HighLowPass()
	Delay()
)
../_images/data-augmentation_nb_14_1.png
../_images/data-augmentation_nb_14_3.png
../_images/data-augmentation_nb_14_5.png
../_images/data-augmentation_nb_14_7.png

Stochastic Audio Data Augmentations

# BUG FIX: RandomApply was used here but only imported in a later cell, so
# running the notebook top-to-bottom raised NameError. Import it up front.
from torchaudio_augmentations import RandomApply

transforms = [
    PolarityInversion(),
    PitchShift(sample_rate=sr, n_samples=audio.shape[1]),
    Reverb(sample_rate=sr)
]

# Wrap the whole chain so it is applied as a group with probability 0.5.
stochastic_transforms = [
    RandomApply(transforms, p=0.5)
]
transform = Compose(stochastic_transforms)
print(transform)
transformed_audio = transform(audio)
display(Audio(transformed_audio, rate=sr))

Audio chain stochastic augmentations

from torchaudio_augmentations import RandomApply

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4

# 4 seconds of audio
num_samples = sr * 4

stochastic_transforms = [
    RandomResizedCrop(n_samples=num_samples),

    # apply the filter/delay chain as a group with p = 0.3
    RandomApply([
            PolarityInversion(),
            HighLowPass(
                sample_rate=sr,
                lowpass_freq_low=2200,
                lowpass_freq_high=4000,
                highpass_freq_low=200,
                highpass_freq_high=1200,
            ),
            Delay(
                sample_rate=sr,
                volume_factor=0.5,
                min_delay=100,
                max_delay=500,
                delay_interval=1,
            ),
        ],
        p=0.3),

    # apply the pitch/gain/noise/reverb chain as a group with p = 0.8
    RandomApply([
            PitchShift(sample_rate=sr, n_samples=num_samples),
            Gain(),
            Noise(max_snr=0.01),
            Reverb(sample_rate=sr)
        ],
        p=0.8)
]
transform = ComposeMany(stochastic_transforms, num_augmented_samples=num_augmented_samples)

print("Transform:", transform)

# BUG FIX: the original cell never called the new transform and instead
# replayed `transformed_audio` left over from an earlier cell, so the
# pipeline defined above was never actually applied.
transformed_audio = transform(audio)

for ta in transformed_audio:
    display(Audio(ta, rate=sr))
plt.show()
Transform: ComposeMany(
	RandomResizedCrop()
	RandomApply(
    p=0.3
    PolarityInversion()
    HighLowPass()
    Delay()
)
	RandomApply(
    p=0.8
    <torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda773b7d60>
    Gain()
    Noise()
    Reverb()
)
)

Single stochastic augmentations

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4

# 4 seconds of audio
num_samples = sr * 4


# define our stochastic augmentations: each transform is wrapped on its own,
# so every stage is applied independently with its own probability
transforms = [
    RandomResizedCrop(n_samples=num_samples),
    RandomApply([PolarityInversion()], p=0.8),
    RandomApply([HighLowPass(sample_rate=sr)], p=0.6),
    RandomApply([Delay(sample_rate=sr)], p=0.6),
    RandomApply([PitchShift(sample_rate=sr, n_samples=num_samples)], p=0.3),
    RandomApply([Gain()], p=0.6),
    RandomApply([Noise(max_snr=0.01)], p=0.3),
    RandomApply([Reverb(sample_rate=sr)], p=0.5)
]


transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)

print("Transform:", transform)
transformed_audio = transform(audio)

for ta in transformed_audio:
    # BUG FIX: original read `title=e=""`, which is a syntax error;
    # pass an empty title string instead.
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
plt.show()
Transform: ComposeMany(
	RandomResizedCrop()
	RandomApply(
    p=0.8
    PolarityInversion()
)
	RandomApply(
    p=0.6
    HighLowPass()
)
	RandomApply(
    p=0.6
    Delay()
)
	RandomApply(
    p=0.3
    <torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda7980f1f0>
)
	RandomApply(
    p=0.6
    Gain()
)
	RandomApply(
    p=0.3
    Noise()
)
	RandomApply(
    p=0.5
    Reverb()
)
)
../_images/data-augmentation_nb_20_1.png
../_images/data-augmentation_nb_20_3.png
../_images/data-augmentation_nb_20_5.png
../_images/data-augmentation_nb_20_7.png